This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Write code to create a new data frame,called 'pf.fc_by_age_gender', that contains information on each age AND gender group.
The data frame should contain the following variables:
mean_friend_count,
median_friend_count,
n (the number of users in each age and gender grouping)
pf=read.csv("pseudo_facebook.tsv",sep='\t')
suppressMessages(library(dplyr))
pf.fc_by_age_gender<-pf %>%
group_by(age,gender) %>%
summarise(mean_friend_count=mean(friend_count),
median_friend_count=as.numeric(median(friend_count)),
n=n()) %>%
ungroup()%>%
arrange(age)
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender mean_friend_count median_friend_count n
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
Create a line graph showing the median friend count over the ages
for each gender.
library("ggplot2")
ggplot(aes(y=median_friend_count,x=age),data=subset(pf.fc_by_age_gender,!is.na(gender)))+geom_line(aes(color=gender))
str(pf.fc_by_age_gender)
## Classes 'tbl_df', 'tbl' and 'data.frame': 274 obs. of 5 variables:
## $ age : int 13 13 14 14 15 15 15 16 16 17 ...
## $ gender : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 NA 1 2 1 ...
## $ mean_friend_count : num 259 102 362 164 539 ...
## $ median_friend_count: num 148 55 224 92.5 276 ...
## $ n : int 193 291 847 1078 1139 1478 1 1238 1848 1236 ...
library("reshape2")
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender mean_friend_count median_friend_count n
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
pf.fc_by_age_gender.wide<-dcast(pf.fc_by_age_gender,age~gender,value.var="median_friend_count")
str(pf.fc_by_age_gender.wide)
## 'data.frame': 101 obs. of 4 variables:
## $ age : int 13 14 15 16 17 18 19 20 21 22 ...
## $ female: num 148 224 276 258 246 ...
## $ male : num 55 92.5 106.5 136 125 ...
## $ NA : num NA NA 116 NA 106 ...
head(pf.fc_by_age_gender.wide)
## age female male NA
## 1 13 148.0 55.0 NA
## 2 14 224.0 92.5 NA
## 3 15 276.0 106.5 116.0
## 4 16 258.5 136.0 NA
## 5 17 245.5 125.0 106.5
## 6 18 243.0 122.0 NA
Plot the ratio of the female to male median friend counts using the data frame pf.fc_by_age_gender.wide.
Add a horizontal line to the plot with a y intercept of 1, which will be the base line
pf.fc_by_age_gender.wide$ratio=pf.fc_by_age_gender.wide$female/pf.fc_by_age_gender.wide$male
head(pf.fc_by_age_gender.wide)
## age female male NA ratio
## 1 13 148.0 55.0 NA 2.690909
## 2 14 224.0 92.5 NA 2.421622
## 3 15 276.0 106.5 116.0 2.591549
## 4 16 258.5 136.0 NA 1.900735
## 5 17 245.5 125.0 106.5 1.964000
## 6 18 243.0 122.0 NA 1.991803
ggplot(aes(x=age,y=ratio),data=pf.fc_by_age_gender.wide)+geom_line()+
geom_hline(yintercept=1,color='red',linetype=2,alpha=0.05)
Create a variable called year_joined in the pf data frame using the variable tenure and 2014 as the reference year.
The variable year joined should contain the year that a user joined facebook.
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
pf$tenure_in_years<-pf$tenure/365
head(pf)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## friendships_initiated likes likes_received mobile_likes
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## mobile_likes_received www_likes www_likes_received tenure_in_years
## 1 0 0 0 0.72876712
## 2 0 0 0 0.01643836
## 3 0 0 0 0.03561644
## 4 0 0 0 0.25479452
## 5 0 0 0 0.22465753
## 6 0 0 0 0.04109589
pf$year_joined=floor(2014-pf$tenure_in_years)
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
Create a new variable in the data frame called year_joined.bucket by using the cut function on the variable year_joined.
pf$year_joined.bucket=cut(pf$year_joined,breaks=c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket)
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. This means you should have four different lines on your plot.
You should subset the data to exclude the users whose year_joined.bucket is NA.
ggplot(aes(x=age,y=friend_count),data=subset(pf,!is.na(year_joined.bucket)))+ geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)
(1) Add another geom_line to code below to plot the grand mean of the friend count vs age.
(2) Exclude any users whose year_joined.bucket is NA.
(3) Use a different line type for the grand mean.
ggplot(aes(x=age,y=friend_count),data=subset(pf,!is.na(year_joined.bucket)))+ geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean)+geom_line(stat="summary",fun.y=mean,linetype=2)
str(pf)
## 'data.frame': 99003 obs. of 18 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tenure_in_years : num 0.7288 0.0164 0.0356 0.2548 0.2247 ...
## $ year_joined : num 2013 2013 2013 2013 2013 ...
## $ year_joined.bucket : Factor w/ 4 levels "(2004,2009]",..: 4 4 4 4 4 4 4 4 4 4 ...
pf_req<-subset(pf,pf$tenure>1)
pf_req$friends_making_rate=pf_req$friend_count/pf_req$tenure
median(pf_req$friends_making_rate)
## [1] 0.2204301
max(pf_req$friends_making_rate)
## [1] 417
Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.
ggplot(aes(x=tenure,y=friends_making_rate),data=subset(pf_req,!is.na(year_joined.bucket)))+geom_line(aes(color=year_joined.bucket),stat='summary',fun.y=mean)
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_smooth(aes(color = year_joined.bucket))
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
library(GGally)
##
## Attaching package: 'GGally'
##
## The following object is masked from 'package:dplyr':
##
## nasa
yo=read.csv("yogurt.csv",header=TRUE,stringsAsFactors = TRUE)
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
yo$id<-factor(yo$id)
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
ggplot(aes(x=price),data=yo)+geom_histogram(binwidth=2)
ggplot(aes(x=time,y=price),data=yo)+geom_point()
set.seed(210)
sample.ids<-sample(levels(yo$id),16)
library(ggplot2)
data(diamonds)
Create a histogram of diamond prices. Facet the histogram by diamond color and use cut to color the histogram bars.
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
ggplot(aes(x=price),data=diamonds)+geom_histogram(aes(fill=cut),binwidth=0.1)+facet_wrap(~color)+scale_x_log10()+scale_fill_brewer(type='qual')
Create a scatterplot of diamond price vs. table and color the points by the cut of the diamond.
ggplot(aes(y=price,x=table),data=diamonds)+geom_point(aes(color=cut))+scale_color_brewer(type='qual')
Create a scatterplot of diamond price vs. volume (x * y * z) and color the points by the clarity of diamonds. Use scale on the y-axis to take the log10 of price. You should also omit the top 1% of diamond volumes from the plot.
diamonds<-transform(diamonds,volume=x*y*z)
diamonds_req<-subset(diamonds,(volume<=quantile(diamonds$volume,0.9)))
ggplot(aes(y=price,x=volume),data=diamonds_req)+geom_point(aes(color=clarity))+scale_y_log10()+scale_color_brewer(type = 'div')
pf=read.csv("pseudo_facebook.tsv",sep='\t')
str(pf)
## 'data.frame': 99003 obs. of 15 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
head(pf)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## friendships_initiated likes likes_received mobile_likes
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## mobile_likes_received www_likes www_likes_received
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
pf$prop_initiated<-ifelse(pf$friend_count>0,pf$friendships_initiated/pf$friend_count,0)
Create a line graph of the median proportion of friendships initiated ('prop_initiated') vs.
tenure and color the line segment by year_joined.bucket.
pf$tenure_in_years<-pf$tenure/365
head(pf)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## friendships_initiated likes likes_received mobile_likes
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## mobile_likes_received www_likes www_likes_received prop_initiated
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## tenure_in_years
## 1 0.72876712
## 2 0.01643836
## 3 0.03561644
## 4 0.25479452
## 5 0.22465753
## 6 0.04109589
pf$year_joined=floor(2014-pf$tenure_in_years)
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
pf$year_joined.bucket=cut(pf$year_joined,breaks=c(2004,2009,2011,2012,2014))
ggplot(aes(x=tenure,y=prop_initiated),data=pf)+geom_line(stat='summary',fun.y='median',aes(color=year_joined.bucket))
## Warning: Removed 2 rows containing missing values (stat_summary).
Smooth the last plot you created of of prop_initiated vs tenure colored by year_joined.bucket. You can us e largerbins for tenure or add a smoother to the plot.
ggplot(aes(x=tenure,y=prop_initiated),data=pf)+geom_line(stat='summary',fun.y='median',aes(color=year_joined.bucket))+stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1)
## Warning: Removed 2 rows containing missing values (stat_summary).
## Warning: Removed 2 rows containing missing values (stat_smooth).
Create a scatter plot of the price/carat ratio of diamonds. The variable x should be assigned to cut. The points should be colored by diamond color, and the plot should be faceted by clarity.
str(diamonds)
## 'data.frame': 53940 obs. of 11 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
## $ volume : num 38.2 34.5 38.1 46.7 51.9 ...
ggplot(aes(x=cut,y=price/carat),data=diamonds)+geom_jitter(aes(color=diamonds$color))+facet_wrap(~clarity)+scale_color_brewer(type = 'div')